# author: Peter Edge
# 12/19/2016
# email: pedge@eng.ucsd.edu

# this is a snakemake Snakefile, written using snakemake 3.5.5

localrules: all

################################################################################
# USER CONFIG
# change this list to limit the chromosomes analyzed
chroms = ['chr{}'.format(x) for x in range(1,23)]+['chrX'] #,'chrY']
# edit these to point to the correct paths for binaries / jars
# or just the program name if you have them in your PATH
HAPCUT2_REPO = '/path/to/cloned/HapCUT2' # absolute path to cloned HapCUT2 git repo
REFERENCE    = 'data/reference.fa'#'/path/to/reference_genome'
HIC_FASTQ    = 'data/hic_{mate}.fastq' # path must end in _{mate}.fastq, with two files ending in _1.fastq _2.fastq both present
VCF_DIR      = 'data/VCFs'            # vcf files should be in this directory and formatted as chr1.vcf, chr2.vcf...
HAPCUT2      = 'HAPCUT2'
EXTRACTHAIRS = 'extractHAIRS'
BWA          = 'bwa'      # 0.7.12-r1044
SAMTOOLS     = 'samtools' # samtools 1-2, htslib 1.21
PICARD       = '/path/to/picard'   # picard version 2.8
BAMTOOLS     = 'bamtools' # version 2.4
################################################################################

import sys,os
utilities_dir = os.path.join(HAPCUT2_REPO,'utilities')
sys.path.append(utilities_dir)
HAPCUT2      = os.path.join(HAPCUT2_REPO,'build','HAPCUT2')
EXTRACTHAIRS = os.path.join(HAPCUT2_REPO,'build','extractHAIRS')

import HiC_repair

rule all:
    input:
        expand('output/{chrom}.hap',chrom=chroms)

# run HapCUT2 to assemble haplotypes from combined Hi-C + 10X haplotype fragments
rule run_hapcut2_hic:
    params: job_name = '{chrom}.hapcut2_hic',
    input:  frag_file = 'data/hic/{chrom}',
            vcf_file  = '%s/{chrom}.vcf' % VCF_DIR
    output: hap = 'output/{chrom}.hap',
            model_file = 'output/{chrom}.htrans_model'
    shell:
        '''
        {HAPCUT2} --fragments {input.frag_file} --vcf {input.vcf_file} --output {output.hap} --hic 1 --htrans_data_outfile {output.model_file}
        '''

# convert Hi-C bam files to haplotype fragment files
rule hic_extract_hairs:
    params: job_name    = 'extracthairs',
    input:  expand('data/hic_separated/hic.REF_{chrom}.bam',chrom=chroms)
    output: expand('data/hic/{chrom}',chrom=chroms)
    run:
        for chrom,bam,frag in zip(chroms,input,output):
            shell('{EXTRACTHAIRS} --HiC 1 --bam {bam} --VCF {VCF_DIR}/{chrom}.vcf >> {frag}')

# split Hi-C bam files by chromosome
rule hic_split_bams:
    params: job_name = 'hic_bamsplit',
            stub     = 'data/hic_separated/hic',
    input:  bam = 'data/hic_processed.bam'
    output: expand('data/hic_separated/hic.REF_{chrom}.bam',chrom=chroms)
    shell:
        '{BAMTOOLS} split -in {input} -reference -stub {params.stub}'

# picard MarkDuplicates on Hi-C reads
rule mark_duplicates:
    params: job_name = 'mark_duplicates_hic'
    input:  bam = 'data/temp/hic_sorted.bam'
    output: bam = 'data/hic_processed.bam',
            metrics = 'data/hic_processed.metrics'
    shell:
        '''java -jar {PICARD} MarkDuplicates READ_NAME_REGEX= null INPUT= {input.bam} OUTPUT= {output.bam} METRICS_FILE= {output.metrics} ASSUME_SORTED= true'''

# samtools fixmate and sort on Hi-C reads
rule fixmate_sort:
    params: job_name = 'fixmate_sort_hic'
    input:  'data/temp/hic_repaired.bam'
    output: temp('data/temp/hic_sorted.bam')
    shell:
        '''{SAMTOOLS} fixmate {input} - | {SAMTOOLS} sort -m 2G -@ 4 -T data/temp/hic_sorted -o {output} -'''

# repair chimeric Hi-C read pairs (from off-center Hi-C linker)
rule repair_chimeras:
    params: job_name = 'repair_chimeras_hic'
    input:  mate1 = 'data/temp/hic_raw_1.bam',
            mate2 = 'data/temp/hic_raw_2.bam'
    output: combined_mates = temp('data/temp/hic_repaired.bam')
    run:
        HiC_repair.repair_chimeras(input.mate1, input.mate2, output.combined_mates, min_mapq=20)

# align HiC fastq file to reference
rule align_fastq:
    params: job_name = 'align_hic'
    input:  ref = REFERENCE,
            idx = REFERENCE+'.bwt',
            fastq = HIC_FASTQ
    output: temp('data/temp/hic_raw_{mate}.bam')
    shell:
        '''
        {BWA} mem -t 4 -B 8 -M {REFERENCE} {input.fastq} | {SAMTOOLS} view -bS -o - - | {SAMTOOLS} sort -m 2G -@ 4 -T data/temp/hic_raw_{wildcards.mate} -n -o {output} -
        '''
        # credit to selvaraj et al for this line from their script

# index reference genome
rule index_genome:
    params: job_name = 'index_reference'
    input:  REFERENCE
    output: REFERENCE+'.bwt'
    shell:
        '{BWA} index {REFERENCE}'
